import scanpy as sc, anndata as ad, numpy as np, pandas as pd
from scipy import sparse
from anndata import AnnData
import warnings
import socket
import seaborn as sns
import itertools
import holoviews as hv
import plotly.express as px
import kaleido
import scanpy.external as sce
from matplotlib import pylab
import sys
import yaml
import os
from pandas.api.types import CategoricalDtype
import plotly
import random
import matplotlib.pyplot as plt
# --- Notebook/session-wide settings (global side effects; keep the order) ---
plotly.offline.init_notebook_mode()  # enable inline plotly rendering in the notebook
get_ipython().run_line_magic('matplotlib', 'inline')  # IPython magic: inline matplotlib figures
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()  # log scanpy/anndata/numpy/... versions for reproducibility
sc.settings.set_figure_params(dpi=50, facecolor='white')  # low-dpi white-background figures
pylab.rcParams['figure.figsize'] = (4, 4)  # default figure size: 4x4 inches
# (notebook output) scanpy==1.8.1 anndata==0.7.6 umap==0.4.6 numpy==1.20.2 scipy==1.6.3 pandas==1.2.4 scikit-learn==0.24.2 statsmodels==0.13.1 python-igraph==0.9.8 louvain==0.7.1 pynndescent==0.5.5
# Load per-host filesystem paths from a YAML config in the user's home directory.
# NOTE(review): yaml.FullLoader on a trusted local config file is acceptable;
# yaml.safe_load would be the more defensive choice. `paths` is currently unused
# below (the host-specific lookups are commented out).
with open(os.path.expanduser('~')+"/paths_config.yaml", 'r') as f:
    paths = yaml.load(f, Loader=yaml.FullLoader)
#indir=paths["paths"]["indir"][hostRoot]
outdir="./outdir"  # intermediate .h5ad files and result tables are written here
#projectBaseDir=paths["paths"]["projectBaseDir"][hostRoot]
adataPath = outdir+"/3_polaroid_quickAnno.h5ad"  # annotated Polaroid dataset
adataRawPath = outdir+"/1_polaroid_mint.h5ad"  # raw counts for the same cells
BadhuriPath = outdir+"/NowaCurated.h5ad"  # curated Badhuri reference
BadhuriRawPath = "./data/Badhuri_atlas.h5ad"  # full Badhuri atlas (source of raw counts)
figDir = "./Figures_publication"  # output directory for publication figures
#outBaseName = "03.2B_Dimreduct_inDepthCompositions"
# Human-readable annotations for the numeric leiden (resolution 0.3) clusters.
# NOTE(review): "POL_Encs-1" vs "POL_EnCs-2" differ only in capitalisation —
# likely a typo in the original labels, preserved because downstream artefacts
# already use these exact strings.
_leiden3_labels = [
    "POL_Encs-1", "POL_CBC/BRCs-1", "POL_EXN-1", "POL_EnCs-2",
    "POL_RGCs-1", "POL_RGCs-2", "POL_ccRGCs", "POL_inPCs",
    "POL_EXN-2", "POL_RtCs", "POL_ER-Cs", "POL_CBC/BRCs-2",
]
leiden3Mapping = {str(clusterId): label
                  for clusterId, label in enumerate(_leiden3_labels)}

# Badhuri cluster labels get a "BADH_" prefix so that both datasets share a
# single, unambiguous label namespace after concatenation.
_badhuri_cell_types = (
    'ipc', 'interneuron', 'dividing', 'opc',
    'radial', 'cr', 'excitatory', 'endothelial',
)
badhuriClMapping = {cellType: "BADH_" + cellType for cellType in _badhuri_cell_types}

# Fixed display orders for categorical axes in downstream plots.
BadhuriCategoriesOrder = ["BADH_" + cellType for cellType in
                          ("dividing", "radial", "opc", "ipc",
                           "cr", "excitatory", "interneuron", "endothelial")]
PolaroidsCategoriesorder = ["POL_CRGC", "POL_RGC1", "POL_RGC2", "POL_RCs",
                            "POL_EXN1", "POL_EXN2", "POL_InN", "POL_ER",
                            "POL_CPC1", "POL_CPC2",
                            "POL_ChP/CBC/BRC-1", "POL_ChP/CBC/BRC-2"]
# Load the annotated Polaroid dataset and attach human-readable cluster labels.
adata = sc.read_h5ad(adataPath)
# Map cluster ids -> names and pin the category order to the mapping's order.
adata.obs["leidenAnnotated"] = adata.obs["leiden0.3"].replace(leiden3Mapping).astype("category").cat.reorder_categories(list(leiden3Mapping.values()))
# Reuse the original cluster colours so both UMAP panels are comparable.
adata.uns["leidenAnnotated_colors"] = adata.uns["leiden0.3_colors"].copy()
sc.pl.umap(adata, color=["leiden0.3","leidenAnnotated"])
# NOTE(review): bare expression — only displayed output in the notebook; it has
# no side effect and could be removed.
adata.raw.to_adata().X.data
# Replace the processed matrix with raw counts for the same cells/genes.
adataRaw = sc.read_h5ad(adataRawPath)
adata.X = adataRaw[adata.obs_names, adata.var_names].X
del adataRaw  # free memory
# Load the Badhuri atlas twice: the full atlas (for raw counts) and the curated
# subset (for cells/annotations).
adataBadhuri_raw = sc.read_h5ad(BadhuriRawPath)
varMap = adataBadhuri_raw.var.copy()  # keep var metadata before switching to .raw
adataBadhuri_raw = adataBadhuri_raw.raw.to_adata()
# Re-attach var metadata and re-index genes by their feature_name (gene symbol).
adataBadhuri_raw.var = adataBadhuri_raw.var.merge(varMap.drop(columns=["feature_biotype"]), left_index=True, right_index=True, how="left").set_index("feature_name")
adataBadhuri_raw.var.index.name = None
adataBadhuri = sc.read_h5ad(BadhuriPath)
adataBadhuri = adataBadhuri.raw.to_adata()
# Swap in raw counts for the curated cells/genes.
adataBadhuri.X = adataBadhuri_raw[adataBadhuri.obs_names, adataBadhuri.var_names].X
# Prefix cluster labels with "BADH_" (shared namespace with the Polaroid labels).
adataBadhuri.obs["cluster_label_aggr"] = adataBadhuri.obs["cluster_label_aggr"].replace(badhuriClMapping)
del adataBadhuri_raw  # free memory
# (notebook output)
# /usr/local/lib/python3.8/dist-packages/anndata/_core/anndata.py:794: UserWarning:
# AnnData expects .var.index to contain strings, but got values like:
# ['RP11-34P13.7', 'AP006222.1', 'RP4-669L17.4', 'LINC01409', 'LINC00115']
# Inferred to be: categorical
# ---- Harmonise metadata across the two datasets and concatenate them on the
# ---- shared gene set. ----
# Subset for common genes
Commongenes = adataBadhuri.var_names.intersection(adata.var_names)
# Keep the full obs tables around in case anything is needed later.
adataBadhuri_obs_backup = adataBadhuri.obs.copy()
adata_obs_backup = adata.obs.copy()
# Store relevant metadata: one shared annotation column plus an
# annotation+region/area composite for finer-grained grouping.
adataBadhuri.obs["leidenAnnotated"] = adataBadhuri.obs["cluster_label_aggr"].copy()
adataBadhuri.obs["leidenAnnotated_area"] = adataBadhuri.obs["leidenAnnotated"].astype(str) +"_"+ adataBadhuri.obs["cortical_area"].astype(str)
adata.obs["leidenAnnotated_area"] = adata.obs["leidenAnnotated"].astype(str) +"_"+ adata.obs["region"].astype(str)
# Tag each cell with its dataset of origin.
adataBadhuri.obs["dataset"] = "Badhuri"
adata.obs["dataset"] = "Polaroids"
# Keep only relevant cols. (The original subset obs twice — once to two columns
# before adding "dataset" and once after; a single final subset is equivalent.)
adataBadhuri.obs = adataBadhuri.obs[["leidenAnnotated","leidenAnnotated_area","dataset"]]
adata.obs = adata.obs[["leidenAnnotated","leidenAnnotated_area","dataset"]]
# Concatenate on the shared genes; obs columns are identical on both sides.
CombinedAdata = ad.concat([adataBadhuri[:,Commongenes],adata[:,Commongenes]])
del adataBadhuri  # free memory
# (notebook output) /usr/local/lib/python3.8/dist-packages/anndata/_core/merge.py:895: UserWarning: Only some AnnData objects have `.raw` attribute, not concatenating `.raw` attributes.
# Snapshot each processing stage as a named layer: raw counts ("mint"),
# library-size-normalised ("libNorm", target 20k counts/cell), and its log1p.
CombinedAdata.layers["mint"] = CombinedAdata.X.copy()
sc.pp.normalize_total(CombinedAdata, target_sum = 20000)
CombinedAdata.layers["libNorm"] = CombinedAdata.X.copy()
sc.pp.log1p(CombinedAdata)
CombinedAdata.layers["libNorm_logNorm"] = CombinedAdata.X.copy()
# Backup taken before pseudobulking mutates obs below.
CombinedAdata_BU = CombinedAdata.copy()
#CombinedAdata_BU.write_h5ad(outdir+"/Correlaton_preBulk.h5ad")
# (notebook output)
# normalizing counts per cell
#     finished (0:00:04)
# Target number of pseudoreplicates per group. (NOTE(review): "Pseudoo" is a
# typo, kept because the name is reused in later cells.)
PseudooReplicates_per_group = 5
#CombinedAdata_BU = sc.read_h5ad(outdir+"/Correlaton_preBulk.h5ad")
CombinedAdata = CombinedAdata_BU.copy()
# Collapse sub-region suffixes (proximal/medial/distal, piece1-3) so that all
# pieces of the same annotation aggregate into a single group.
CombinedAdata.obs["leidenAnnotated_area_aggregated"] = CombinedAdata.obs["leidenAnnotated_area"].astype("str").replace({"_proximal":"","_medial":"","_distal":"",
                                                                                                                        "_piece1":"","_piece2":"","_piece3":""},
                                                                                                                       regex=True).astype("category")
groupingCovariate = "leidenAnnotated_area_aggregated"
# Report the expected number of cells per pseudoreplicate for each group.
print("Pbulking with target of "+str(PseudooReplicates_per_group)+" PRs will result in following cells per PR")
# NOTE(review): bare expression — displayed its value in the notebook only.
CombinedAdata.obs.groupby(groupingCovariate).size() / PseudooReplicates_per_group
# (notebook output)
# Pbulking with target of 5 PRs will result in following cells per PR
# leidenAnnotated_area_aggregated BADH_cr_motor 18.6 BADH_cr_parietal 3.2 BADH_cr_pfc 23.0 BADH_cr_somatosensory 3.2 BADH_cr_temporal 13.2 BADH_cr_v1 21.0 BADH_dividing_motor 1598.0 BADH_dividing_parietal 1342.4 BADH_dividing_pfc 960.0 BADH_dividing_somatosensory 1419.4 BADH_dividing_temporal 1155.6 BADH_dividing_v1 1433.6 BADH_endothelial_motor 13.8 BADH_endothelial_parietal 3.2 BADH_endothelial_pfc 14.0 BADH_endothelial_somatosensory 9.8 BADH_endothelial_temporal 7.4 BADH_endothelial_v1 9.8 BADH_excitatory_motor 5371.0 BADH_excitatory_parietal 6073.4 BADH_excitatory_pfc 9463.6 BADH_excitatory_somatosensory 2654.2 BADH_excitatory_temporal 3947.2 BADH_excitatory_v1 4911.8 BADH_interneuron_motor 3947.0 BADH_interneuron_parietal 2328.2 BADH_interneuron_pfc 2280.0 BADH_interneuron_somatosensory 3180.0 BADH_interneuron_temporal 1538.0 BADH_interneuron_v1 1097.6 BADH_ipc_motor 1533.6 BADH_ipc_parietal 2489.2 BADH_ipc_pfc 1360.6 BADH_ipc_somatosensory 1849.4 BADH_ipc_temporal 1197.0 BADH_ipc_v1 2188.0 BADH_opc_motor 287.2 BADH_opc_parietal 498.0 BADH_opc_pfc 313.8 BADH_opc_somatosensory 305.8 BADH_opc_temporal 111.8 BADH_opc_v1 143.0 BADH_radial_motor 1354.4 BADH_radial_parietal 2498.8 BADH_radial_pfc 1494.2 BADH_radial_somatosensory 1434.2 BADH_radial_temporal 1450.6 BADH_radial_v1 1383.6 POL_CBC/BRCs-1 522.8 POL_CBC/BRCs-2 42.8 POL_ER-Cs 82.2 POL_EXN-1 477.6 POL_EXN-2 252.4 POL_EnCs-2 438.8 POL_Encs-1 688.6 POL_RGCs-1 327.6 POL_RGCs-2 308.8 POL_RtCs 97.4 POL_ccRGCs 269.4 POL_inPCs 256.0 dtype: float64
# ---- Build pseudobulk profiles: split each group's cells into
# ---- PseudooReplicates_per_group random partitions and sum their
# ---- library-normalised counts per partition. ----
total = pd.DataFrame(index = CombinedAdata.var_names)  # genes x pseudoreplicates
# One metadata row per (group, pseudoreplicate) pair, labelled "<group>_<rep>".
rep_ids = [str(r) for r in range(PseudooReplicates_per_group)]
total_metadata = pd.DataFrame(
    index=["_".join(pair) for pair in itertools.product(
        CombinedAdata.obs[groupingCovariate].unique().tolist(), rep_ids)])
for group in CombinedAdata.obs[groupingCovariate].unique().tolist():
    groupAdata = CombinedAdata[CombinedAdata.obs[groupingCovariate] == group]
    group_cells = groupAdata.obs_names.tolist()
    # Fixed seed -> reproducible partitioning of cells into pseudoreplicates.
    random.Random(4).shuffle(group_cells)
    metaCellslist = [group_cells[i::PseudooReplicates_per_group]
                     for i in range(PseudooReplicates_per_group)]
    # Unpack the index/cells pair explicitly (the original indexed an opaque
    # `partition` tuple as partition[0]/partition[1]).
    for repIdx, repCells in enumerate(metaCellslist):
        repName = '{}_{}'.format(group, repIdx)
        # Sum library-normalised counts over the replicate's cells; .A1 flattens
        # the sparse row-sum matrix to a 1-d array.
        total[repName] = CombinedAdata[repCells].layers["libNorm"].sum(axis = 0).A1
        total_metadata.loc[repName, "group"] = group
        total_metadata.loc[repName, "pseudoreplicate"] = repIdx
        total_metadata.loc[repName, "number_of_cell"] = int(len(repCells))
    # With this line we propagate - whenever possible - obs to aggregated pReplicate
    # (only obs columns that are constant within the group are propagated).
    for obsMD in [c for c in groupAdata.obs.columns.tolist()
                  if len(groupAdata.obs[c].unique()) == 1 and c != groupingCovariate]:
        # .iloc[0] takes the first value positionally; the original's `[0]`
        # relied on pandas' deprecated positional fallback on a label index.
        total_metadata.loc[["_".join(l) for l in itertools.product([group], rep_ids)], obsMD] = \
            CombinedAdata.obs.loc[CombinedAdata.obs[groupingCovariate] == group, obsMD].iloc[0]
# Drop metadata columns that could not be propagated for every group.
total_metadata = total_metadata.dropna(axis = 1)
# Assemble the pseudobulk AnnData (pseudoreplicates x genes) and renormalise:
# each pseudobulk column summed over a different number of cells, so
# library-size normalisation is applied again before log1p.
# NOTE(review): the trailing .copy() after the constructor is redundant.
adata_bulk = sc.AnnData(total.transpose()).copy()
adata_bulk.var = CombinedAdata.var.copy()
adata_bulk.obs = pd.concat([adata_bulk.obs, total_metadata], axis = 1)
sc.pp.normalize_total(adata_bulk, target_sum = 20000)
sc.pp.log1p(adata_bulk)
# (notebook output)
# normalizing counts per cell
#     finished (0:00:00)
from importlib import reload
import harmony_mod
from harmony_mod import *
reload(harmony_mod)
from harmony_mod import *
npcs=10  # number of PCs used downstream (dendrogram/correlations)
adata_bulk_DS = adata_bulk.copy()
#adata_bulk_DS = adata_bulk_DS[adata_bulk_DS.obs.dataset.isin(["Polaroids"])]
# HVGs are computed per dataset (batch_key); keep only genes highly variable in
# BOTH batches (highly_variable_intersection).
HVGs = sc.pp.highly_variable_genes(adata_bulk_DS,n_top_genes=2000,batch_key="dataset", inplace =False)
HVGs = HVGs[HVGs.highly_variable_intersection].index.tolist()
adata_bulk_DS.var["highly_variable"] = False
adata_bulk_DS.var.loc[HVGs,"highly_variable"] = True
# .copy() materialises the gene subset: without it this is an AnnData *view*,
# and the later writes triggered pandas' SettingWithCopyWarning in the
# original run.
adata_bulk_DS = adata_bulk_DS[:,adata_bulk_DS.var["highly_variable"]].copy()
sc.tl.pca(adata_bulk_DS, use_highly_variable=True)
sc.pl.pca_variance_ratio(adata_bulk_DS)
# NOTE(review): `title` is a single string for two colour panels, hence the
# "title list is shorter than the number of panels" warning in the output.
sc.pl.pca(adata_bulk_DS, color = ["group","dataset"], ncols = 1, add_outline=True, size = 20, title="Pre harmony PCA")
#adata_bulk_DS.obsm["X_pca"] = adata_bulk_DS.obsm["X_pca"][:,:npcs]
# Harmony batch correction across datasets, writing corrected PCs back to X_pca.
harmony_integrate_mod(adata_bulk_DS, 'dataset', adjusted_basis ="X_pca", theta = 4, max_iter_harmony = 20)
#sc.pp.neighbors(adata_bulk_DS, n_neighbors=5, n_pcs=10)
sc.pl.pca(adata_bulk_DS, color = ["group","dataset"], ncols = 1, add_outline=True, size = 20, title="Post harmony PCA")
# (notebook output)
# If you pass `n_top_genes`, all cutoffs are ignored. extracting highly variable genes
# ... storing 'group' as categorical ... storing 'leidenAnnotated' as categorical ... storing 'dataset' as categorical
#     finished (0:00:00)
# computing PCA
#     on highly variable genes
#     with n_comps=50
#     finished (0:00:00)
# /usr/local/lib/python3.8/dist-packages/pandas/core/indexing.py:1637: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# WARNING: The title list is shorter than the number of panels. Using 'color' value instead for some plots.
# /usr/local/lib/python3.8/dist-packages/scipy/cluster/vq.py:575: UserWarning: One of the clusters is empty. Re-run kmeans with a different initialization. 2023-03-28 10:29:17,141 - harmonypy - INFO - Iteration 1 of 20 ... 2023-03-28 10:29:17,554 - harmonypy - INFO - Converged after 7 iterations
# WARNING: The title list is shorter than the number of panels. Using 'color' value instead for some plots.
# ---- Correlate groups in the harmony-corrected PCA space ----
contrastCov = "group"
# sc.tl.dendrogram also stores the inter-group correlation matrix in
# .uns["dendrogram_<cov>"], computed on the first `npcs` PCs.
sc.tl.dendrogram(adata_bulk_DS, contrastCov, n_pcs=npcs)
#sc.pl.correlation_matrix(adata_bulk_DS, contrastCov, figsize=(40,30), show_correlation_numbers=True)
#sc.tl.dendrogram(adata_bulk_DS, 'group')
GroupCat = adata_bulk_DS.obs[contrastCov].cat.categories
# Split the group labels by dataset of origin.
BadhuriGroups = adata_bulk_DS.obs.loc[adata_bulk_DS.obs["dataset"] == "Badhuri",contrastCov].unique().tolist()
PolaroidsGroups = adata_bulk_DS.obs.loc[adata_bulk_DS.obs["dataset"] == "Polaroids",contrastCov].unique().tolist()
# Order each dataset's groups by the dendrogram's leaf order.
BadhuriGroupsOrdered = [i for i in adata_bulk_DS.uns["dendrogram_"+contrastCov]["categories_ordered"] if i in BadhuriGroups]
PolaroidsGroupsOrdered = [i for i in adata_bulk_DS.uns["dendrogram_"+contrastCov]["categories_ordered"] if i in PolaroidsGroups]
# Square correlation matrix indexed by category (rows/cols in category order).
corrDF = pd.DataFrame(adata_bulk_DS.uns["dendrogram_"+contrastCov]["correlation_matrix"],
                      columns=GroupCat,
                      index=GroupCat)
# Rectangular Polaroids (rows) vs Badhuri (cols) slice. NOTE(review):
# "Tectangular" is a typo, kept because the name is reused below.
corrDFTectangular = corrDF.loc[PolaroidsGroupsOrdered,BadhuriGroupsOrdered]
# ---- Heatmap of Polaroids (rows) vs Badhuri (cols) group correlations ----
fig, ax = plt.subplots(figsize=(6, 4))
ax.pcolormesh(corrDFTectangular, cmap="PRGn")
ax.set_xlim(0, corrDFTectangular.shape[1])
ax.set_ylim(0, corrDFTectangular.shape[0])
# Row labels on the right, column labels on top (rotated for readability);
# ticks centred on cells via the +0.5 offset.
ax.yaxis.tick_right()
ax.set_yticks(np.arange(corrDFTectangular.shape[0]) + 0.5)
ax.set_yticklabels(PolaroidsGroupsOrdered, size=3)
ax.xaxis.tick_top()
ax.xaxis.set_tick_params(labeltop=True)
ax.xaxis.set_tick_params(labelbottom=False)
ax.set_xticks(np.arange(corrDFTectangular.shape[1]) + 0.5)
ax.set_xticklabels(BadhuriGroupsOrdered, rotation=45, ha='left', size=3)
# Annotate every cell with its correlation value. enumerate() gives the cell
# coordinates directly, replacing the original's per-cell get_loc() lookups.
for rowIdx, row in enumerate(corrDFTectangular.index):
    for colIdx, col in enumerate(corrDFTectangular.columns):
        ax.text(
            colIdx + 0.5,
            rowIdx + 0.5,
            f"{corrDFTectangular.loc[row, col]:.2f}",
            ha='center',
            va='center',
            size=2,
        )
# Ensure output directories exist — savefig/to_csv raise FileNotFoundError on
# a missing directory.
os.makedirs(figDir, exist_ok=True)
os.makedirs(outdir + "/Correlations", exist_ok=True)
fig.savefig(figDir+"/correlation_region.pdf", bbox_inches='tight')
corrDFTectangular.to_csv(outdir+"/Correlations/Corr_PCA_region.tsv",sep="\t")
# (notebook output) using 'X_pca' with n_pcs = 10 Storing dendrogram info using `.uns['dendrogram_group']`